Tutorial step 7: adding MMX optimizations |
The first step to optimizing our filter would be to use assembly code, but we'll spend some time on MMX optimizations first, because MMX can really speed this code up. The trick to using MMX is knowing when we can and can't use it; CPU detection code alone isn't good enough, because the user may have intentionally forced or disabled MMX support in VirtualDub's preferences dialog.
Checking to see if CPU optimizations are available |
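Two functions in the FilterFunctions struct are useful here: isFPUEnabled() and isMMXEnabled(), which report whether FPU and MMX optimizations, respectively, are currently enabled.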
FPU optimizations are rare in pixel processing; the only VirtualDub filter that uses them is the bilinear resize filter, because it requires 6 64-bit multiplies per pixel, which are more quickly done with the FPU than the integer units. MMX optimizations are the biggie; use them if at all possible.
The FPU and MMX enable flags are guaranteed not to change between filters or during processing. It is highly recommended that you globally cache the flags during startProc processing, and read the global flag in runProc.
Adding MMX optimizations to our filter |
First, modify startProc to cache the MMX flag.
// File-scope cache of the MMX enable flag. It is filled in from
// ff->isMMXEnabled() in tutorialStartProc and read back in tutorialRunProc,
// so the flag does not have to be queried again for every frame.
static bool g_MMXenabled;
int tutorialStartProc(FilterActivation *fa, const FilterFunctions *ff) {
MyFilterData *mfd = (MyFilterData *)fa->filter_data;
int i;
g_MMXenabled = ff->isMMXEnabled();
Now, write the MMX acceleration routine. For simplicity, it will only handle one scanline. I'm not usually very clear when writing MMX code, so don't feel bad if it takes a couple of passes to understand. ^^;;
// doscan_MMX: process one scanline of 32-bit pixels using MMX.
//
//   dst     - destination scanline
//   src     - source scanline
//   w       - number of source pixels to process; the loops below test their
//             counter only after the first iteration, so w must be nonzero
//   frac    - 16-bit scale factor applied with pmulhw (result is the high
//             16 bits of the signed product, i.e. value * frac / 65536)
//   bias    - value added to word 1 of the unpacked pixel (the channel the
//             inline comments call green)
//   fDouble - nonzero: every result pixel is written twice (8 bytes of output
//             per source pixel); zero: written once
//
// The function is declared naked, so the body saves/restores registers and
// issues ret itself, and the arguments are fetched directly from the stack.
// This routine does not execute emms; that is left to the caller.
//
// NOTE(review): xloop1 (fDouble == 0) doubles the unpacked channels before
// pmulhw, while xloop2 (fDouble != 0) does not, so for the same frac the two
// paths scale by different factors. Confirm whether that is intended.
void __declspec(naked) doscan_MMX(Pixel32 *dst, Pixel32 *src, int w, long frac, long bias, int fDouble) {
// Mask selecting word 2 of a pixel unpacked to four 16-bit words.
static const __int64 Rmask = 0x0000FFFF00000000i64;
__asm {
// Save four registers (16 bytes). Every argument offset below therefore
// carries a "+16" on top of its normal position above the return address.
push ebp
push edi
push esi
push ebx
// eax = dst, edx = src, ecx = w
mov eax,[esp+4+16]
mov edx,[esp+8+16]
mov ecx,[esp+12+16]
// ecx = -4*w (negative byte count). eax and edx are moved to the end of
// their scanlines so that [reg+ecx] starts at the first pixel and the loop
// finishes when ecx has counted up to zero.
neg ecx
shl ecx,2
sub eax,ecx
sub edx,ecx
// mm6 = mask for word 2
movq mm6,Rmask
// mm4 = bias shifted into word 1
movd mm4,[esp+20+16]
psllq mm4,16
// mm5 = frac with its low 16 bits replicated into words 0 and 1
// (words 2 and 3 receive the upper 16 bits of frac)
movd mm5,[esp+16+16]
punpcklwd mm5,mm5
// mm7 = 0, used as the high bytes when unpacking bytes to words
pxor mm7,mm7
// Select the loop: fDouble == 0 -> xloop1, otherwise xloop2.
mov ebx,dword ptr [esp+24+16]
or ebx,ebx
jz xloop1
// Doubled output: advance eax by another 4*w bytes so that [eax+ecx*2]
// addresses a destination row of 8 bytes per source pixel.
sub eax,ecx
xloop2:
movd mm0,[edx+ecx] ;mm0 = pixel
movq mm1,mm6 ;mm1 = R mask
punpcklbw mm0,mm7 ;unpack pixel to words
pand mm1,mm0 ;mm1 = red component
pmulhw mm0,mm5 ;scale green and blue
paddw mm1,mm4 ;add green bias
paddw mm0,mm1 ;add scaled green/blue
packuswb mm0,mm0 ;repack pixel to bytes
movq [eax+ecx*2],mm0 ;write 2 pixels
add ecx,4
jne xloop2
jmp short xit
xloop1:
movd mm0,[edx+ecx] ;mm0 = pixel
movq mm1,mm6 ;mm1 = R mask
punpcklbw mm0,mm7 ;unpack pixel to words
pand mm1,mm0 ;mm1 = red component
paddw mm0,mm0 ;double g/b channels beforehand
pmulhw mm0,mm5 ;scale green and blue
paddw mm1,mm4 ;add green bias
paddw mm0,mm1 ;add scaled green/blue
packuswb mm0,mm0 ;repack pixel to bytes
movd [eax+ecx],mm0 ;write pixel
add ecx,4
jne xloop1
xit:
// Restore the saved registers in reverse order and return; a naked function
// gets no compiler-generated epilogue.
pop ebx
pop esi
pop edi
pop ebp
ret
}
}
Finally, add the MMX optimizations to runProc.
// Per-frame processing entry point.
//
// Walks the source frame one scanline at a time and writes the filtered
// pixels to the destination frame. When the cached g_MMXenabled flag is set,
// each scanline is handed to doscan_MMX; otherwise a table-driven scalar loop
// is used. With mfd->fExpand set, every result pixel is written twice.
//
//   fa - filter activation: source/destination frames and the filter's data
//   ff - filter callback functions (not used here; the MMX flag is read from
//        the g_MMXenabled cache instead)
//
// Always returns 0.
int tutorialRunProc(const FilterActivation *fa, const FilterFunctions *ff) {
    MyFilterData *mfd = (MyFilterData *)fa->filter_data;
    PixDim w, h;
    Pixel32 *src, *dst;
    const Pixel32 *grn_tab = mfd->grn_tab;
    const Pixel32 *blu_tab = mfd->blu_tab;

    // These depend only on the filter configuration, so pick them once
    // instead of once per scanline.
    const long frac = mfd->fThird ? 0x2AAA : 0x4000;
    const long bias = mfd->fThird ? 0x55 : 0x80;

    // Every loop below (and the ones inside doscan_MMX) tests its counter
    // only after the first pass, so an empty frame would wrap the counter
    // and run far past the buffers. Nothing to do in that case.
    if (fa->src.w <= 0 || fa->src.h <= 0)
        return 0;

    src = (Pixel32 *)fa->src.data;
    dst = (Pixel32 *)fa->dst.data;
    h = fa->src.h;

    do {
        w = fa->src.w;

        if (g_MMXenabled) {
            doscan_MMX(dst, src, w, frac, bias, mfd->fExpand);

            // doscan_MMX does not move our pointers, so step a full
            // scanline (pitch) to reach the next row.
            src = (Pixel32 *)((char *)src + fa->src.pitch);
            dst = (Pixel32 *)((char *)dst + fa->dst.pitch);
        } else {
            // double the routine for speed; an if would kill us in the
            // inner loop, but in the outer loop it's ok
            if (mfd->fExpand)
                do {
                    Pixel32 old_pixel, new_pixel;

                    old_pixel = *src++;
                    new_pixel = (old_pixel & 0xFF0000)
                              + grn_tab[(old_pixel>>8) & 0xff]
                              + blu_tab[old_pixel & 0xff];

                    *dst++ = new_pixel;
                    *dst++ = new_pixel;
                } while(--w);
            else
                do {
                    Pixel32 old_pixel, new_pixel;

                    old_pixel = *src++;
                    new_pixel = (old_pixel & 0xFF0000)
                              + grn_tab[(old_pixel>>8) & 0xff]
                              + blu_tab[old_pixel & 0xff];

                    *dst++ = new_pixel;
                } while(--w);

            // The pixel loops already advanced the pointers across the
            // row, so only the modulo remains to be skipped.
            src = (Pixel32 *)((char *)src + fa->src.modulo);
            dst = (Pixel32 *)((char *)dst + fa->dst.modulo);
        }
    } while(--h);

    // Leave MMX state before returning, since doscan_MMX does not do so.
    if (g_MMXenabled)
        __asm emms

    return 0;
}
MMX optimizations basically just require the MMX code; you spend 99.99% of your time wringing out more speed. Two more important notes:
On to the next, and final, chapter of this tutorial!
back to main page
tutorial[6]: supporting user configuration
tutorial[8]: adding job (batch) support
VirtualDub external filter SDK 1.05 | ©1999-2001 Avery Lee <phaeron@virtualdub.org> |